#### Evaluate the month range, EDD thresholds, and variable inclusion,
#### using leave-one-out state cross-validation.

### Prepare environment

setwd("~/research/coffee-dataverse/src")

library(ggplot2)
library(dplyr)
library(lfe)
library(splines)

## Load and prepare the data

df <- read.csv("../data/observations-full.csv", fileEncoding='utf-8')
df$logyield <- log(df$total.produce / df$total.harvest)
df$logyield[!is.finite(df$logyield)] <- NA

df$tmin <- 0
df$gdd <- 0
df$edd <- 0
df$prcp <- 0
for (suffix in c('12.1', paste0('0', 1:5))) {
    df$tmin <- df$tmin + df[, paste0('tmin', suffix)]
    df$gdd <- df$gdd + df[, paste0('above0', suffix)] - df[, paste0('above32', suffix)]
    df$edd <- df$edd + df[, paste0('above32', suffix)]
    df$prcp <- df$prcp + df[, paste0('prcp', suffix)]
}
df$tmin <- df$tmin / 6
df$prcp2 <- df$prcp^2
df$gdd1000 <- df$gdd / 1000
df$edd1000 <- df$edd / 1000

df$state <- gsub("^(.+)\\((..)\\)", "\\2", df$Munic�pio)

summary(felm(logyield ~ gdd1000 + edd1000 + tmin + prcp + prcp2 + state:ns(year, df=3) | Munic�pio + year | 0 | Munic�pio, data=df))

## Cross-validation on the EDD thresholds and month

thresholds <- c(0, 10, 20, 28, 30, 32, 34)
monthsuffs <- c(paste0('0', 1:9, '.1'), paste0(10:12, '.1'), paste0('0', 1:9), 10:12)

## Remove state:ns(year, df=3) from all
df.demean <- df[, c('Munic�pio', 'year', 'total.produce', 'total.harvest', 'state', 'logyield')]
for (col in c('below0', paste0('above', thresholds), 'tmin', 'tmax', 'tavg', 'prcp', 'logyield')) {
    print(col)
    if (col == 'logyield')
        monthcols <- col
    else
        monthcols <- paste0(col, monthsuffs)

    for (monthcol in monthcols) {
        ## mod <- felm(as.formula(paste(monthcol, "~ state:ns(year, df=3) | Munic�pio + factor(year)")), data=df)
        mod <- felm(as.formula(paste(monthcol, "~ state:ns(year, df=3) | Munic�pio")), data=df)
        df.demean[, monthcol] <- NA
        df.demean[-mod$na.action, monthcol] <- mod$resid
    }
}

## Determine subsets for cross-valiation, using area by state
(df %>% group_by(state) %>% summarize(total.harvest=sum(total.harvest)) %>% arrange(-total.harvest))
largest <- (df %>% group_by(state) %>% summarize(total.harvest=sum(total.harvest)) %>% arrange(-total.harvest))$state[1:13]
allrest <- unique(df$state[!(df$state %in% largest)])
sum(df$total.harvest[df$state %in% allrest])
df.demean$xval.state <- df.demean$state
df.demean$xval.state[!(df.demean$state %in% largest)] <- "XX"

## Perform the cross-validation dropping one state at a time
drop.state.xval <- function(formula, df.demean) {
    allerrors <- rep(NA, nrow(df.demean))
    for (drop.state in c(largest, 'XX')) {

        df.train <- subset(df.demean, xval.state != drop.state)
        df.tests <- subset(df.demean, xval.state == drop.state)

        mod <- lm(formula, data=df.train)
        logyield.tests <- predict(mod, df.tests)
        allerrors[df.demean$xval.state == drop.state] <- df.tests$logyield - logyield.tests
    }

    sqrt(mean(allerrors^2, na.rm=T))
}

## Load the results (to be appended to)
if (file.exists("crossval.csv")) {
    results <- read.csv("crossval.csv", colClasses=c('character', 'character'))

    results$include.below0 <- as.logical(results$include.below0)
    results$include.tmin <- as.logical(results$include.tmin)
    results$include.tmax <- as.logical(results$include.tmax)
    results$include.tavg <- as.logical(results$include.tavg)
    results$include.gedd <- as.logical(results$include.gedd)
    results$include.prcp <- as.logical(results$include.prcp)
    results$threshold.gdd <- as.numeric(as.character(results$threshold.gdd))
    results$threshold.edd <- as.numeric(as.character(results$threshold.edd))
    results$rmse <- as.numeric(as.character(results$rmse))
} else
    results <- data.frame()

## Iterate through all combinations of months, variables, and thresholds
for (mm1 in 1:length(monthsuffs))
    for (mm2 in mm1:length(monthsuffs)) {
        for (col in c('below0', paste0('above', thresholds), 'tmin', 'tmax', 'tavg', 'prcp')) {
            df.demean[, col] <- 0
            for (monthsuff in monthsuffs[mm1:mm2])
                df.demean[, col] <- df.demean[, col] + df.demean[, paste0(col, monthsuff)]
        }
        df.demean$prcp2 <- df.demean$prcp^2

        for (include.below0 in !c(T, F))
            for (include.tmin in c(T, F))
                for (include.tmax in !c(T, F))
                    for (include.tavg in !c(T, F)) {
                        if ((include.tmin && include.tmax && include.tavg) || ((include.tmin || include.tmax) && include.tavg))
                            next
                        for (include.gedd in c(T, F))
                            for (include.prcp in c(T, F))
                                if (include.gedd) {
                                    for (threshold.gdd in c(0, 10, 20))
                                        for (threshold.edd in c(28, 30, 32, 34)) {
                                            ## Check if already in results
                                            if (any(results$month1 == monthsuffs[mm1] & results$month2 == monthsuffs[mm2] & results$include.below0 == include.below0 & results$include.tmin == include.tmin & results$include.tmax == include.tmax & results$include.tavg == include.tavg & results$include.gedd == include.gedd & results$include.prcp == include.prcp & results$threshold.gdd == threshold.gdd & results$threshold.edd == threshold.edd))
                                                next

                                            df.demean$gdd <- df.demean[, paste0('above', threshold.gdd)] - df.demean[, paste0('above', threshold.edd)]
                                            df.demean$edd <- df.demean[, paste0('above', threshold.edd)]
                                            allvars <- c(0, ifelse(include.below0, 'below0', NA),
                                                         ifelse(include.tmin, 'tmin', NA),
                                                         ifelse(include.tmax, 'tmax', NA),
                                                         ifelse(include.tavg, 'tavg', NA),
                                                         ifelse(include.gedd, 'gdd', NA),
                                                         ifelse(include.prcp, 'prcp', NA))
                                            allvars <- allvars[!is.na(allvars)]
                                            if ('gdd' %in% allvars)
                                                allvars <- c(allvars, 'edd')
                                            if ('prcp' %in% allvars)
                                                allvars <- c(allvars, 'prcp2')
                                            avform <- as.formula(paste0("logyield ~ ", paste(allvars, collapse=' + ')))
                                            rmse <- drop.state.xval(avform, df.demean)
                                            resrow <- data.frame(month1=monthsuffs[mm1], month2=monthsuffs[mm2], include.below0, include.tmin, include.tmax, include.tavg, include.gedd, include.prcp, threshold.gdd, threshold.edd, rmse)
                                            print(resrow)
                                            results <- rbind(results, resrow)
                                        }
                                } else {
                                    ## Check if already in results
                                    if (any(results$month1 == monthsuffs[mm1] & results$month2 == monthsuffs[mm2] & results$include.below0 == include.below0 & results$include.tmin == include.tmin & results$include.tmax == include.tmax & results$include.tavg == include.tavg & results$include.gedd == include.gedd & results$include.prcp == include.prcp & is.na(results$threshold.gdd) & is.na(results$threshold.edd)))
                                        next

                                    allvars <- c(0, ifelse(include.below0, 'below0', NA),
                                                 ifelse(include.tmin, 'tmin', NA),
                                                 ifelse(include.tmax, 'tmax', NA),
                                                 ifelse(include.tavg, 'tavg', NA),
                                                 ifelse(include.prcp, c('prcp', 'prcp2'), NA))
                                    allvars <- allvars[!is.na(allvars)]
                                    if ('gdd' %in% allvars)
                                        allvars <- c(allvars, 'edd')
                                    if ('prcp' %in% allvars)
                                        allvars <- c(allvars, 'prcp2')
                                    avform <- as.formula(paste0("logyield ~ ", paste(allvars, collapse=' + ')))
                                    rmse <- drop.state.xval(avform, df.demean)
                                    resrow <- data.frame(month1=monthsuffs[mm1], month2=monthsuffs[mm2], include.below0, include.tmin, include.tmax, include.tavg, include.gedd, include.prcp, threshold.gdd=NA, threshold.edd=NA, rmse)
                                    print(resrow)
                                    results <- rbind(results, resrow)
                                }
                    }
        write.csv(results, "crossval.csv", row.names=F)
    }
